1 Pre-processing

1.1 Load packages

library("RColorBrewer")
library(Signac)
library(Seurat)
library(GenomicRanges)
library(future)
#library(SeuratWrappers)
library(harmony)
library(EnsDb.Hsapiens.v86)
library(stringr)
library(dplyr)
library(ggplot2)
library(patchwork)
library(kableExtra)

set.seed(173)

(ggpubr)[http://rpkgs.datanovia.com/ggpubr/reference/index.html]

1.2 Parameters

# Paths
path_to_obj <- here::here("~/Documents/multiome_tonsil_Lucia/results/R_objects/10.tonsil_multiome_integrated_without_doublets.rds")
path_to_save <- here::here("~/Documents/multiome_tonsil_Lucia/results/R_objects/11.tonsil_multiome_integrated_bcell_normalized.rds")

# Thresholds
max_doublet_score_rna <- 0.3

1.3 Load data

tonsil_wnn_without_doublet <- readRDS(path_to_obj)

1.4 scATAC

1.4.1 Normalization and linear dimensional reduction

We exclude the first dimension as this is typically correlated with sequencing depth Cells cluster completely separately in ATAC without harmony; so run harmony after SVD

RunSVD LSI

DefaultAssay(tonsil_wnn_without_doublet) <- "peaks"
tonsil_wnn_without_doublet <- RunTFIDF(tonsil_wnn_without_doublet)
## Performing TF-IDF normalization
tonsil_wnn_without_doublet <- FindTopFeatures(tonsil_wnn_without_doublet, min.cutoff = "q0")
tonsil_wnn_without_doublet <- RunSVD(tonsil_wnn_without_doublet)
## Running SVD
## Scaling cell embeddings

1.4.2 Plot the Depth correlation plot

Compute the correlation between total counts and each reduced dimension component.

LSI component is typically highly correlated with sequencing depth. The first LSI component often captures sequencing depth (technical variation) rather than biological variation. If this is the case, the component should be removed from downstream analysis. We can assess the correlation between each LSI component and sequencing depth using the DepthCor() function:

For scRNA-seq data we don’t typically observe such a strong relationship between the first PC and sequencing depth, and so usually retain the first PC in downstream analyses.

DepthCor(tonsil_wnn_without_doublet)

Here we see there is a very strong correlation between the first LSI component and the total number of counts for the cell, so we will perform downstream steps without this component.

1.4.3 UMAP representation

  • dimensional reduction key, specifies the string before the number for the dimension names. UMAP by default
  • reduction.name: Name to store dimensional reduction under in the Seurat object
tonsil_wnn_without_doublet <- RunUMAP(
  tonsil_wnn_without_doublet,
  dims = 2:40,
  reduction = "lsi",
  reduction.name = "umap.atac",
  reduction.key = "atacUMAP_"
)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 21:39:42 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:39:43 Read 66390 rows and found 39 numeric columns
## 21:39:43 Using Annoy for neighbor search, n_neighbors = 30
## 21:39:43 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:39:49 Writing NN index file to temp file /var/folders/kz/y10np0cj213fybqz181z9lghbzr42p/T//RtmpV3jxpN/file452160cb1dbb
## 21:39:50 Searching Annoy index using 1 thread, search_k = 3000
## 21:40:17 Annoy recall = 100%
## 21:40:19 Commencing smooth kNN distance calibration using 1 thread
## 21:40:22 Initializing from normalized Laplacian + noise
## 21:40:25 Commencing optimization for 200 epochs, with 2918692 positive edges
## 21:41:06 Optimization finished
atac.umap<-DimPlot(
  tonsil_wnn_without_doublet,
  reduction = "umap.atac",
  group.by = "library_name",
  pt.size = 0.1
) + ggtitle('scATAC UMAP') + NoLegend()

atac.umap

#split_by: library ,edad, genero

1.5 scRNA

1.5.1 Normalization and linear dimensional reduction-

1.5.2 NormalizeData

DefaultAssay(tonsil_wnn_without_doublet) <- "RNA"
tonsil_wnn_without_doublet <- NormalizeData(
  tonsil_wnn_without_doublet,
  normalization.method = "LogNormalize",
  scale.factor = 1e4
)

tonsil_wnn_without_doublet <- tonsil_wnn_without_doublet %>%
  FindVariableFeatures(nfeatures = 3000) %>%
  ScaleData() %>% 
  RunPCA() 
PCAPlot(tonsil_wnn_without_doublet,
  group.by = "library_name")

ElbowPlot(object = tonsil_wnn_without_doublet)

find variable genes

1.5.3 UMAP representation

tonsil_wnn_without_doublet <- RunUMAP(
  tonsil_wnn_without_doublet,
  dims = 1:30,
  reduction = "pca",
  reduction.name = "umap.rna",
  reduction.key = "rnaUMAP_"
)
## 21:43:20 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:43:20 Read 66390 rows and found 30 numeric columns
## 21:43:20 Using Annoy for neighbor search, n_neighbors = 30
## 21:43:20 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:43:27 Writing NN index file to temp file /var/folders/kz/y10np0cj213fybqz181z9lghbzr42p/T//RtmpV3jxpN/file45215194158f
## 21:43:27 Searching Annoy index using 1 thread, search_k = 3000
## 21:43:50 Annoy recall = 100%
## 21:43:52 Commencing smooth kNN distance calibration using 1 thread
## 21:43:55 Initializing from normalized Laplacian + noise
## 21:44:00 Commencing optimization for 200 epochs, with 2953672 positive edges
## 21:44:42 Optimization finished
rna.umap<-DimPlot(
  tonsil_wnn_without_doublet,
  reduction = "umap.rna",
  group.by = "library_name",
  pt.size = 0.1) + NoLegend() + ggtitle('scRNA UMAP')

rna.umap

atac.umap + rna.umap

hacer primero harmony, quitar batch effect. atac y rna. harmony

2 Run Harmony

Pass the Seurat object to the RunHarmony function and specify which variable to integrate out. A Seurat object is returned with corrected Harmony coordinates.

2.1 scATAC

DefaultAssay(tonsil_wnn_without_doublet) <- "peaks"
tonsil_wnn_without_doublet <- RunHarmony(
  object = tonsil_wnn_without_doublet,
  reduction = "lsi",
  dims = 2:40,
  group.by.vars = "library_name",
  assay.use = "peaks",
  project.dim = FALSE,
  reduction.save = "harmony_peaks"
)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony 8/10
## Harmony converged after 8 iterations

2.1.1 UMAP representation

tonsil_wnn_without_doublet <- RunUMAP(
  tonsil_wnn_without_doublet,
  dims = 2:40,
  reduction = "harmony_peaks",
  reduction.name = "umap.atac",
  reduction.key = "atacUMAP_"
)
## 21:49:49 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:49:49 Read 66390 rows and found 39 numeric columns
## 21:49:49 Using Annoy for neighbor search, n_neighbors = 30
## 21:49:49 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:49:56 Writing NN index file to temp file /var/folders/kz/y10np0cj213fybqz181z9lghbzr42p/T//RtmpV3jxpN/file45211cd3ba62
## 21:49:56 Searching Annoy index using 1 thread, search_k = 3000
## 21:50:26 Annoy recall = 100%
## 21:50:27 Commencing smooth kNN distance calibration using 1 thread
## 21:50:30 Initializing from normalized Laplacian + noise
## 21:50:34 Commencing optimization for 200 epochs, with 3014858 positive edges
## 21:51:16 Optimization finished
Harm_peak<-DimPlot(
  tonsil_wnn_without_doublet,
  reduction = "umap.atac",
  group.by = "library_name",
  pt.size = 0.1
) + NoLegend() + ggtitle('Peak Harmony')

3 scRNA

tonsil_wnn_without_doublet <- RunUMAP(
  tonsil_wnn_without_doublet,
  dims = 2:40,
  reduction = "harmony_rna",
  reduction.name = "umap.rna",
  reduction.key = "rnaUMAP_"
)
## 21:54:53 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:54:53 Read 66390 rows and found 39 numeric columns
## 21:54:53 Using Annoy for neighbor search, n_neighbors = 30
## 21:54:53 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:55:00 Writing NN index file to temp file /var/folders/kz/y10np0cj213fybqz181z9lghbzr42p/T//RtmpV3jxpN/file45214288fe7d
## 21:55:00 Searching Annoy index using 1 thread, search_k = 3000
## 21:55:26 Annoy recall = 100%
## 21:55:27 Commencing smooth kNN distance calibration using 1 thread
## 21:55:31 Initializing from normalized Laplacian + noise
## 21:55:35 Commencing optimization for 200 epochs, with 3049358 positive edges
## 21:56:18 Optimization finished
Harm_rna<-DimPlot(
  tonsil_wnn_without_doublet,
  reduction = "umap.rna",
  group.by = "library_name",
  pt.size = 0.1
) + NoLegend() + ggtitle('RNA Harmony')

3.1 scATAC and RNAseq Harmony

Harm_peak+Harm_rna + plot_annotation(title = 'Harmony ATAC and RNA UMAP visualization')

3.2 Joint UMAP visualization

FindNeighbors

Constructs a Shared Nearest Neighbor (SNN) Graph for a given dataset. We first determine the k-nearest neighbors of each cell. We use this knn graph to construct the SNN graph by calculating the neighborhood overlap (Jaccard index) between every cell and its k.param nearest neighbors.

# build a joint neighbor graph using both assays
tonsil_wnn_without_doublet <- FindMultiModalNeighbors(
  object = tonsil_wnn_without_doublet,
  reduction.list = list("harmony_peaks", "harmony_rna"),
  dims.list = list(2:40, 1:30), modality.weight.name = "Joint_snn_umap"
  )
## Calculating cell-specific modality weights
## Finding 20 nearest neighbors for each modality.
## Calculating kernel bandwidths
## Warning in FindMultiModalNeighbors(object = tonsil_wnn_without_doublet, :
## The number of provided modality.weight.name is not equal to the number of
## modalities. peaks.weight RNA.weight are used to store the modality weights
## Finding multimodal neighbors
## Constructing multimodal KNN graph
## Constructing multimodal SNN graph
# build a joint UMAP visualization

tonsil_wnn_without_doublet <- RunUMAP(
  object = tonsil_wnn_without_doublet,
  nn.name = "weighted.nn",
  reduction.name = "wnn.umap",
  reduction.key = "wnnUMAP_")
## 22:00:27 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:00:29 Commencing smooth kNN distance calibration using 1 thread
## 22:00:32 Initializing from normalized Laplacian + noise
## 22:00:36 Commencing optimization for 200 epochs, with 2156298 positive edges
## 22:01:21 Optimization finished
joint.umap<- DimPlot(tonsil_wnn_without_doublet, label = FALSE, group.by = "library_name", pt.size = 0.1,  reduction = "wnn.umap") + plot_annotation(title = 'Joint UMAP')+ ggtitle('Joint UMAP by library name') + NoLegend()

joint.umap

joint.umap_age<- DimPlot(tonsil_wnn_without_doublet, label = FALSE, split.by = "age_group", pt.size = 0.1,  reduction = "wnn.umap") + plot_annotation(title = 'Joint UMAP ')+ ggtitle('Joint UMAP age ') 

joint.umap_hospital<- DimPlot(tonsil_wnn_without_doublet, label = FALSE, split.by = "hospital", pt.size = 0.1,  reduction = "wnn.umap") + plot_annotation(title = 'Joint UMAP ')+ ggtitle('Joint UMAP age ') 

joint.umap_age

joint.umap_hospital

3.3 FindClusters

#find cluster algorithm 3 = SLM algorithm

tonsil_wnn_without_doublet <- FindClusters(tonsil_wnn_without_doublet, resolution = c(0.005,0.01,0.05,0.075),algorithm = 3, graph.name = "wsnn",verbose = FALSE)
print(colnames(tonsil_wnn_without_doublet@meta.data))
##  [1] "lib_name_barcode"      "orig.ident"            "nCount_RNA"           
##  [4] "nFeature_RNA"          "nCount_ATAC"           "nFeature_ATAC"        
##  [7] "nucleosome_signal"     "nucleosome_percentile" "TSS.enrichment"       
## [10] "TSS.percentile"        "tss.level"             "percent.mt"           
## [13] "percent_ribo"          "nCount_peaks"          "nFeature_peaks"       
## [16] "library_name"          "donor_id"              "sex"                  
## [19] "age"                   "age_group"             "hospital"             
## [22] "assay"                 "barcodes"              "doublet_scores"       
## [25] "predicted_doublets"    "peaks.weight"          "RNA.weight"           
## [28] "wsnn_res.0.005"        "wsnn_res.0.01"         "seurat_clusters"      
## [31] "sub.cluster_0.25"      "sub.cluster0_0.5"      "is_doublet"           
## [34] "wsnn_res.0.05"         "wsnn_res.0.75"         "wsnn_res.0.075"
vars <- str_subset(colnames(tonsil_wnn_without_doublet@meta.data), "^wsnn_res")
clusters_gg <- purrr::map(vars, function(x) {
  p <- DimPlot(
    tonsil_wnn_without_doublet,
    group.by = x,
    reduction = "wnn.umap",
    pt.size = 0.1, label = T
  )
  p 
})
clusters_gg
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

Idents(tonsil_wnn_without_doublet)<-"wsnn_res.0.05"
tonsil_markers_05<-FindAllMarkers(object = tonsil_wnn_without_doublet, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7

3.3.1 Get top n for each cluster

Resolution 0.01

tonsil_markers_05 %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) %>% write.csv(.,file=paste0("~/Documents/multiome_tonsil_Lucia/results/tables/", "top10_tonsil_markers_no_doublets_05.csv"))

tonsil_markers_05 %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC) %>% write.csv(.,file=paste0("~/Documents/multiome_tonsil_Lucia/results/tables/", "top5_tonsil_markers_no_doublets_05.csv"))


top5_tonsil_markers_05<-tonsil_markers_05 %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC)
top10_tonsil_markers_05<-tonsil_markers_05 %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
df_top5<-as.data.frame(top5_tonsil_markers_05)
kbl(df_top5,caption = "Table of the top 5 marker of each cluster resolution 0.005") %>%
  kable_paper("striped", full_width = F)
Table 1: Table of the top 5 marker of each cluster resolution 0.005
p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
0 3.107693 0.993 0.428 0 0 BANK1
0 2.508477 0.619 0.107 0 0 COL19A1
0 2.262911 0.951 0.456 0 0 ARHGAP24
0 2.214714 0.815 0.338 0 0 ADAM28
0 2.113625 0.876 0.384 0 0 MARCH1
0 3.210898 0.909 0.120 0 1 INPP4B
0 3.085975 0.933 0.131 0 1 FYB1
0 2.824967 0.622 0.067 0 1 LEF1
0 2.674320 0.736 0.152 0 1 IL6ST
0 2.663301 0.715 0.081 0 1 IL7R
0 2.958610 0.932 0.151 0 2 HMGB2
0 2.887534 0.949 0.224 0 2 TUBA1B
0 2.613753 0.947 0.252 0 2 H2AFZ
0 2.497528 0.796 0.021 0 2 TOP2A
0 2.462446 0.917 0.086 0 2 STMN1
0 2.944159 0.984 0.213 0 3 AC023590.1
0 2.785879 0.834 0.216 0 3 MAML3
0 2.620607 0.930 0.163 0 3 RAPGEF5
0 2.606568 0.820 0.110 0 3 AC104170.1
0 2.377730 0.926 0.186 0 3 AFF2
0 3.634883 0.742 0.026 0 4 CCL5
0 3.252688 0.791 0.158 0 4 AOAH
0 2.901600 0.189 0.004 0 4 GNLY
0 2.662984 0.559 0.011 0 4 NKG7
0 2.536666 0.515 0.044 0 4 DTHD1
0 5.974981 0.648 0.243 0 5 IGHG1
0 6.002027 0.835 0.415 0 5 IGLC1
0 6.156868 0.964 0.892 0 5 IGKC
0 6.496845 0.678 0.408 0 5 IGHA1
0 6.068831 0.901 0.718 0 5 IGLC2
0 4.322800 0.764 0.041 0 6 SLC8A1
0 3.986308 0.714 0.008 0 6 LYZ
0 3.382540 0.721 0.005 0 6 PLXDC2
0 3.338779 0.282 0.020 0 6 S100A9
0 3.476821 0.173 0.014 0 6 SPRR3
0 4.400369 0.976 0.059 0 7 LINC01374
0 3.957180 0.920 0.008 0 7 LINC01478
0 3.698304 0.928 0.010 0 7 FAM160A1
0 3.453543 0.992 0.186 0 7 RUNX2
0 3.753866 0.992 0.534 0 7 TCF4

4 Markers exploration

MARKERS

Immature B cells express CD19, CD 20, CD34, CD38, and CD45R, T-cell receptor/CD3 complex (TCR/CD3 complex)

  • T-cells (identified by high expression of CD3D and CD3E).
  • monocytes (identified by high expression of LYZ and S100A8).
  • naive B-cells (identified by high expression of CD79A, CD79B and BLNK).
  • plasma cells (identified by B-cell and proliferation markers, such as TOP2A or MKI67).
  • poor-quality cells (identified by high mitochondrial expression). If a cell has pores in the membrane due to cell lysis, the cytosolic mRNA will leak out of the cell; but if the diameter of mitochondria is higher than the pores, they will get trapped inside the cell.

DZ: SUGCT, CXCR4, AICDA

LZ: CD83, BCL2A1

GC total: MEF2B, BCL6, IRF4

PC: PRDM1, SLAMF7, MZB1, FKBP11

canonical_bcell_markers <-c("CD34", "CD38", "CD19")

monocytes_markers<-c("LYZ","S100A8")

naive_markers<-c("CD79A", "CD79B", "BLNK")

bib_Bcell_markers<-c("CD19","CR2","MS4A1","RALGPS2","CD79A")
bib_Tcell_markers<-c("CD3E","CD4","CD8A","FOXP3","IL17A")

markers_bcell<-c("BANK1","ARHGAP24","ADAM28","MARCH1","RAPGEF5","AFF2","RGS13","LPP","IGHG1","IGLC1","SLC8A1","LYZ","PLXDC2","FAM160A1","IGHA1","IGLC2", "SETBP1","ENTPD1","COL19A1","CCSER1")

markers_tcell<-c("INPP4B","FYB1","LEF1","IL7R","IL6ST","CCL5","GNLY","NKG7","DTHD1","RUNX2", "FOXP3","CD8A","IL17A","CD2")

4.1 Bibliography markers

4.1.1 B cells

markers_gg <- purrr::map(bib_Bcell_markers, function(x) {
  
  p <- FeaturePlot(
    tonsil_wnn_without_doublet,
    features = x,
    reduction = "wnn.umap",
    pt.size = 0.1
  )
  p
})
markers_gg
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

markers_gg <- purrr::map(naive_markers, function(x) {
  
  p <- FeaturePlot(
    tonsil_wnn_without_doublet,
    features = x,
    reduction = "wnn.umap",
    pt.size = 0.1
  )
  p
})
markers_gg
## [[1]]

## 
## [[2]]

## 
## [[3]]

4.1.2 T-cells

CD8+ T cell markers:“CD3D”, “CD8A” NK cell markers:“GNLY”, “NKG7”

markers_gg <- purrr::map(bib_Tcell_markers, function(x) {
  
  p <- FeaturePlot(
    tonsil_wnn_without_doublet,
    features = x,
    reduction = "wnn.umap",
    pt.size = 0.1
  )
  p
})
markers_gg
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

4.1.3 B-cells

m<-c("PRDM1","XBP1","IRF4","MEF2B","BCL6")

DZ<-c("SUGCT", "CXCR4", "AICDA")

LZ<- c("CD83","BCL2A1")

GC<- c("MEF2B", "BCL6","IRF4")

PC<- c("PRDM1","SLAMF7", "MZB1", "FKBP11")


markers_gg <- purrr::map(m, function(x) {
  
  p <- FeaturePlot(
    tonsil_wnn_without_doublet,
    features = x,
    reduction = "wnn.umap",
    pt.size = 0.1
  )
  p
})
markers_gg
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

markers_gg <- function(x){purrr::map(x, function(x) {
  
  p <- FeaturePlot(
    tonsil_wnn_without_doublet,
    features = x,
    reduction = "wnn.umap",
    pt.size = 0.1
  )
  p
})}
markers_gg(DZ)
## [[1]]

## 
## [[2]]

## 
## [[3]]

markers_gg(LZ)
## [[1]]

## 
## [[2]]

markers_gg(GC)
## [[1]]

## 
## [[2]]

## 
## [[3]]

markers_gg(PC)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

5 Save

saveRDS(tonsil_wnn_without_doublet, path_to_save)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.